In [1]:
import pandas as pd
import numpy as np
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
trn = pd.read_csv("../input/train_clean.csv")
target = pd.read_csv("../input/train.csv", usecols=["target"])
tst = pd.read_csv("../input/test_clean.csv")
test_id = tst["ncodpers"]
tst.drop(["ncodpers"], axis=1, inplace=True)
trn.drop(["ncodpers"], axis=1, inplace=True)
print(trn.shape, target.shape, tst.shape)
In [3]:
trn.info(); target.info(); tst.info()  # info() prints directly and returns None, so don't wrap it in print()
In [4]:
trn.columns == tst.columns
Out[4]:
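When there are many columns, a single boolean is easier to scan than the element-wise array above; `Index.equals` gives exactly that:

print(trn.columns.equals(tst.columns))  # True iff names and order match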
In [5]:
for col in trn.columns:
    if trn[col].dtype == "object":
        print(col)
In [6]:
for col in trn.columns:
    if trn[col].dtype == "object":
        lb = LabelEncoder()
        # fit on train+test jointly so both frames share one label mapping
        lb.fit(pd.concat([trn[col], tst[col]]))
        trn[col] = lb.transform(trn[col])
        tst[col] = lb.transform(tst[col])
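The same joint train/test encoding can also be written with pandas categoricals; a minimal sketch, assuming the object columns contain no NaN (inferred categories are sorted, so the resulting codes match LabelEncoder's):

# Equivalent alternative: one shared category mapping for train and test
for col in trn.select_dtypes(include="object").columns:
    cats = pd.Categorical(pd.concat([trn[col], tst[col]])).categories
    trn[col] = pd.Categorical(trn[col], categories=cats).codes
    tst[col] = pd.Categorical(tst[col], categories=cats).codes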
In [7]:
for col in trn.columns:
    print(col, trn[col].dtype, tst[col].dtype)
In [8]:
for t in np.unique(target):
    print(t, sum(target["target"] == t))
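The same per-class counts are available in a single pandas call:

# Per-class frequencies in one call, sorted by class label
print(target["target"].value_counts().sort_index())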
In [9]:
# Remove classes with low frequency
In [10]:
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes
trn = trn[target["target"].isin(rem_targets)]
target = target[target["target"].isin(rem_targets)]
# pass the Series, not the DataFrame, to fit_transform
target = LabelEncoder().fit_transform(target["target"])
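For reference, the hard-coded rem_targets list could instead be derived from a frequency cutoff; a sketch where min_count is a hypothetical threshold (not in the original notebook), re-reading the raw labels since target is overwritten above:

raw_target = pd.read_csv("../input/train.csv", usecols=["target"])["target"]
counts = raw_target.value_counts()
min_count = 1000  # hypothetical cutoff; tune so the intended classes survive
rem_targets = sorted(counts[counts >= min_count].index)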
In [14]:
def evaluate(x, y, model):
    trn_scores = dict(); vld_scores = dict()
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]
        # fit model
        model.fit(x_trn, y_trn)
        # evaluate on train data
        preds = model.predict(x_trn)
        acc_scores = trn_scores.get('accuracy', [])
        acc_scores.append(accuracy_score(y_trn, preds))
        trn_scores['accuracy'] = acc_scores
        f1_scores = trn_scores.get('f1 score', [])
        f1_scores.append(f1_score(y_trn, preds, average='weighted'))
        trn_scores['f1 score'] = f1_scores
        preds = model.predict_proba(x_trn)
        log_scores = trn_scores.get('log loss', [])
        log_scores.append(log_loss(y_trn, preds))
        trn_scores['log loss'] = log_scores
        # evaluate on validation data
        preds = model.predict(x_vld)
        acc_scores = vld_scores.get('accuracy', [])
        acc_scores.append(accuracy_score(y_vld, preds))
        vld_scores['accuracy'] = acc_scores
        f1_scores = vld_scores.get('f1 score', [])
        f1_scores.append(f1_score(y_vld, preds, average='weighted'))
        vld_scores['f1 score'] = f1_scores
        preds = model.predict_proba(x_vld)
        log_scores = vld_scores.get('log loss', [])
        log_scores.append(log_loss(y_vld, preds))
        vld_scores['log loss'] = log_scores
    return trn_scores, vld_scores

def print_scores(trn_scores, vld_scores):
    prefix = '    '
    cols = ['accuracy', 'f1 score', 'log loss']
    print('='*50)
    print('TRAIN EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(trn_scores[col])))
        print('# {} Raw  : {}'.format(prefix, trn_scores[col]))
    print('='*50)
    print('VALID EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(vld_scores[col])))
        print('# {} Raw  : {}'.format(prefix, vld_scores[col]))

def print_time(end, start):
    print('='*50)
    elapsed = end - start
    print('{} secs'.format(round(elapsed)))

def fit_and_eval(trn, target, model):
    trn_scores, vld_scores = evaluate(trn, target, model)
    print_scores(trn_scores, vld_scores)
    print_time(time.time(), st)  # st: global start time, set in the next cell
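For reference, sklearn's cross_validate can compute the same three metrics with the same split object; a sketch equivalent in spirit to evaluate() above (an addition, not part of the original run):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Same StratifiedShuffleSplit configuration as in evaluate()
cv = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
scores = cross_validate(LogisticRegression(n_jobs=-1, random_state=777),
                        trn, target, cv=cv,
                        scoring=['accuracy', 'f1_weighted', 'neg_log_loss'],
                        return_train_score=True)
print(np.mean(scores['test_accuracy']), -np.mean(scores['test_neg_log_loss']))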
In [15]:
st = time.time()
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs=-1, random_state=777)
fit_and_eval(trn, target, model)
# 58 sec
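Coefficient magnitudes from an unscaled logistic regression fit are not directly comparable across columns, which matters for the coefficient inspection below. A minimal sketch that standardizes features first; scaled_model is an addition, not part of the original run:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each column before the fit so coefficient magnitudes are
# comparable across features; not from the original notebook.
scaled_model = make_pipeline(StandardScaler(),
                             LogisticRegression(n_jobs=-1, random_state=777))
fit_and_eval(trn, target, scaled_model)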
In [25]:
# Utility
def observe_model_lr(model):
    target_num = 0
    print('='*50)
    print(model)
    print('='*50)
    print('# Coefficients for target_num == {}'.format(target_num))
    print(model.coef_[target_num])
    print('-'*50)
    print('# Mapped to Column Name')
    prefix = '    '
    coefs = dict()
    for i, coef in enumerate(model.coef_[target_num]):
        print('{} {} \t {}'.format(prefix, round(coef, 5), trn.columns[i]))
        coefs[trn.columns[i]] = np.absolute(coef)
    print('-'*50)
    print('# Sorted Feature Importance')
    coefs_sorted = sorted(coefs.items(), key=operator.itemgetter(1), reverse=True)
    for item in coefs_sorted:
        print('{} {} \t {}'.format(prefix, round(item[1], 5), item[0]))
    return coefs_sorted

def plot_coef(coef):
    x = []; y = []
    for item in coef:
        x.append(item[0])
        y.append(item[1])
    f, ax = plt.subplots(figsize=(20, 15))
    sns.barplot(x=x, y=y, alpha=0.5)
    ax.set_title('Feature Importance for Model : Logistic Regression')
    ax.set(xlabel='Column Name', ylabel='Feature Importance')
In [26]:
# Inspect the model in detail
coef = observe_model_lr(model)
In [27]:
# Visualize the main features
plot_coef(coef)
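With this many columns the x tick labels tend to overlap; rotating them in the same cell is a small readability fix (an addition, not in the original):

plt.xticks(rotation=90)  # keep long column names legible on the x axis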
In [29]:
trn.head()
Out[29]:
In [30]:
trn["age"] = (trn["age"]/10).astype(int)
In [33]:
tst["age"] = (tst["age"]/10).astype(int)
In [ ]:
# indresi, conyuemp, and ult_fec_cli_1t have low frequencies, so drop them